eda

Last modified 12:22 AM on May 17, 2016. This document, R session image, version history, knitr cache, figures, and other associated datasets are located in /inside/grotto/blin/trna-markers/luad/predict/.

library(GenomicRanges)
library(RColorBrewer)
library(pheatmap)
library(plyr)
load('/inside/home/blin/grotto/trna-markers/process-reads/luad-counts.RData')
load('/inside/home/blin/grotto/trna-markers/process-reads/luad-metadata.RData')
load('/inside/grotto/blin/data/hg19-srnas.RData')

excludeExtraneousGenes = function(counts, threshold = 0.96) {
  cor = cor(t(counts))
  cor[lower.tri(cor)] = NA # remove duplicates (x-y and y-x)
  cor_genes = data.frame(cbind(which(!is.na(cor), arr.ind = TRUE), na.omit(as.vector(cor))))
  cor_genes$row = rownames(cor)[cor_genes$row]
  cor_genes$col = colnames(cor)[cor_genes$col]
  cor_genes = cor_genes[cor_genes$V3 != 1 & cor_genes$V3 > threshold, ]
  if (nrow(cor_genes) == 0) return(rownames(cor))
  extraneous_genes = c()
  ok_genes = c()
  for (i in 1:nrow(cor_genes)) {
    gene1 = cor_genes$row[i]
    gene2 = cor_genes$col[i]
    if (!(gene1 %in% extraneous_genes) & !(gene1 %in% ok_genes)) {
      if (!(gene2 %in% extraneous_genes) & !(gene2 %in% ok_genes)) {
        ok_genes = c(ok_genes, gene1)
        extraneous_genes = c(extraneous_genes, gene2)
      }
      else {
        extraneous_genes = c(extraneous_genes, gene1)
      }
    }
    else {
      extraneous_genes = c(extraneous_genes, gene2)
    }
  }
  ok_genes
}

filterVariance = function(counts, var = 0.5) {
  gene_var = sapply(data.frame(t(counts)), var)
  names(gene_var) = rownames(counts)
  counts[which(gene_var > quantile(gene_var, 0.5)), ]
}

Default metadata for all heatmaps

counts = luad_adjusted_counts[rownames(luad_adjusted_counts) %in% srnas[srnas$class != 'tRNA']$tx_name, ]

heatmap_metadata = luad_metadata[luad_metadata$barcode %in% colnames(counts), ]
# subtype
load('/inside/grotto/blin/trna-markers/luad/cluster-subtypes/clusters.RData')
clusters$subtype[!clusters$known] = "Unknown"
heatmap_metadata$subtype = as.factor(clusters$subtype[match(substr(heatmap_metadata$barcode, 1, 12), clusters$barcode)])
# stage
levels(heatmap_metadata$stage) = list(I = c("Stage I", "Stage IA", "Stage IB"), II = c("Stage II", "Stage IIA", "Stage IIB"), III = c("Stage III", "Stage IIIA", "Stage IIIB"), IV = "Stage IV", Unknown = '[Not Available]')
# n stage
levels(heatmap_metadata$n_stage) = list(N0 = "N0", N1 = "N1", N2 = "N2", N3 = "N3", Unknown = c("NX", "[Not Available]"))
# smoker
heatmap_metadata$smoker = as.factor(heatmap_metadata$smoker)
levels(heatmap_metadata$smoker) = list(Smoker = 1, Nonsmoker = 0)
# heatmap annotation
levels(heatmap_metadata$sample_type) = list(TP = "TP", NT = "NT")
annot_col = data.frame(stage = heatmap_metadata$stage, subtype = heatmap_metadata$subtype, n_stage = heatmap_metadata$n_stage, sample_type = heatmap_metadata$sample_type, smoker = heatmap_metadata$smoker)
rownames(annot_col) = colnames(counts)
annot_row = data.frame(sRNA = as.factor(srnas[match(rownames(counts), srnas$tx_name), ]$class))
rownames(annot_row) = rownames(counts)
annot_colors = list(stage = setNames(c(brewer.pal(4, "Set1"), "grey"), levels(heatmap_metadata$stage)),
                    subtype = setNames(c(brewer.pal(3, "Set2"), "grey"), levels(heatmap_metadata$subtype)),
                    n_stage = setNames(c(brewer.pal(4, "Set3"), "grey"), levels(heatmap_metadata$n_stage)),
                    sample_type = setNames(c("#EF8A62", "#67A9CF"), levels(heatmap_metadata$sample_type)),
                    smoker = setNames(c("#8dd3c7", "#ffffb3", "grey"), levels(heatmap_metadata$smoker)),
                    sRNA = setNames(c(brewer.pal(8, "Paired"), "grey"), levels(annot_row$sRNA)))

Expression heatmap

df = log1p(counts)
df = df[rownames(df) %in% excludeExtraneousGenes(df), ]
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8,  show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(9, "Reds")) 
plot of chunk expression-heatmap

Fold change heatmaps

Fold change is relative to matched tumor-normal pairs or median normal expression.

getLog2FC = function(counts, metadata) {
  counts = counts - min(counts) + 1
  paired_tp_samples = which(metadata$sample_type == "TP" & metadata$paired)
  log2fc = data.frame(numeric(nrow(counts)))
  median_nt_expression = apply(counts[, paired_tp_samples+1], 1, median)
  for (i in 1:ncol(counts)) {
    sample_metadata = metadata[i, ]
    if (sample_metadata$sample_type == "NT") next
    tp_counts = counts[, i]
    if (sample_metadata$paired) nt_counts = counts[, i+1]
    else nt_counts = median_nt_expression
    log2fc = cbind(log2fc, log2(tp_counts) - log2(nt_counts))
  }
  colnames(log2fc) = metadata[metadata$sample_type == "TP", ]$barcode
  log2fc[, -1]
}
log2fc = getLog2FC(counts, luad_metadata)

General sRNA heatmap

metadata = heatmap_metadata[heatmap_metadata$barcode %in% colnames(log2fc), ]
pheatmap(log2fc, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk log2fc-heatmap-1

# filter out genes by correlation
genelist = excludeExtraneousGenes(log2fc)
df = log2fc[rownames(log2fc) %in% genelist, ]
kable(data.frame(Original = table(srnas[match(rownames(log2fc), srnas$tx_name), ]$class), Noncorrelated = table(srnas[match(genelist, srnas$tx_name), ]$class)))
Original.Var1 Original.Freq Noncorrelated.Var1 Noncorrelated.Freq
genomic-tRF-3 762 genomic-tRF-3 94
genomic-tRF-5 790 genomic-tRF-5 132
miRNA 1529 miRNA 79
snoRNA 397 snoRNA 25
tRF-1 215 tRF-1 6
tRF-3 340 tRF-3 49
tRF-5 360 tRF-5 21
tRF-i 395 tRF-i 32
# filter metadata
metadata = heatmap_metadata[heatmap_metadata$barcode %in% colnames(df), ]
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk log2fc-heatmap-2
# filter out genes by fold change variance
df = filterVariance(df, 0.5)
kable(data.frame(Noncorrelated = table(srnas[match(genelist, srnas$tx_name), ]$class), HiVariance = table(srnas[match(rownames(df), srnas$tx_name), ]$class)))
Noncorrelated.Var1 Noncorrelated.Freq HiVariance.Var1 HiVariance.Freq
genomic-tRF-3 94 genomic-tRF-3 39
genomic-tRF-5 132 genomic-tRF-5 68
miRNA 79 miRNA 51
snoRNA 25 snoRNA 19
tRF-1 6 tRF-1 1
tRF-3 49 tRF-3 24
tRF-5 21 tRF-5 12
tRF-i 32 tRF-i 5
# filter metadata
metadata = heatmap_metadata[heatmap_metadata$barcode %in% colnames(df), ]
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk log2fc-heatmap-2

MicroRNA heatmap

All miRNAs

df = log2fc[srnas[match(rownames(log2fc), srnas$tx_name), ]$class == "miRNA", ]
# heatmap annotation
annot_colors2 = annot_colors
annot_colors2$sRNA = NA
pheatmap(df, annotation_col = annot_col, annotation_colors = annot_colors2, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu"))
plot of chunk mirna-heatmap-1

Filtered miRNA heatmap

# filter for gene variance and extraneous genes
df = df[rownames(df) %in% excludeExtraneousGenes(df), ]
df = filterVariance(df)
pheatmap(df, annotation_col = annot_col, annotation_colors = annot_colors2, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, trace = "none", color = brewer.pal(10, "RdBu"))
plot of chunk mirna-heatmap-2

Selected miRNAs

Li et al. has a list of "significant" miRNAs, let's see if their prediction holds up in terms of log(fold change):

df = log2fc[c("hsa-mir-196b", "hsa-mir-205", "hsa-mir-519a-1", "hsa-mir-31", "hsa-mir-7-1", "hsa-mir-193b", "hsa-mir-766", "hsa-mir-187", "hsa-let-7c", "hsa-mir-331", "hsa-mir-133a-1", "hsa-mir-375", "hsa-mir-323b", "hsa-mir-99a", "hsa-mir-101-1", "hsa-mir-374b"), ]
pheatmap(df, annotation_col = annot_col, annotation_colors = annot_colors2, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, cluster_rows = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk mirna-heatmap-li

tRF heatmap

All tRFs

df = log2fc[srnas[match(rownames(log2fc), srnas$tx_name), ]$class %in% c("tRF-1", "tRF-3", "tRF-5", "genomic-tRF-3", "genomic-tRF-5", "tRF-i"), ]

# heatmap annotation
annot_row2 = data.frame(tRF = as.factor(srnas[match(rownames(df), srnas$tx_name), ]$class))
rownames(annot_row2) = rownames(df)
annot_colors2 = annot_colors
annot_colors2$tRF = setNames(brewer.pal(6, "Paired"), levels(annot_row2$tRF))
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row2, annotation_colors = annot_colors2, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk trf-heatmap-1

Filtered tRF heatmap

df = log2fc[srnas[match(rownames(log2fc), srnas$tx_name), ]$class %in% c("tRF-1", "tRF-3", "tRF-5", "tRF-i"), ]

# heatmap annotation
annot_row2 = data.frame(tRF = as.factor(srnas[match(rownames(df), srnas$tx_name), ]$class))
rownames(annot_row2) = rownames(df)
annot_colors2 = annot_colors
annot_colors2$tRF = setNames(brewer.pal(4, "Set1"), levels(annot_row2$tRF))
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row2, annotation_colors = annot_colors2, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk trf-heatmap-2

# filter for gene variance and extraneous genes
df = df[rownames(df) %in% excludeExtraneousGenes(df), ]
df = filterVariance(df)

# heatmap annotation
annot_row2 = data.frame(tRF = as.factor(srnas[match(rownames(df), srnas$tx_name), ]$class))
rownames(annot_row2) = rownames(df)
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row2, annotation_colors = annot_colors2, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk trf-heatmap-3

Sort samples by subtype

df2 = df[, metadata[order(metadata$subtype), ]$barcode]
df2 = df2[, metadata[order(metadata$subtype), ]$subtype %in% c("PP", "PI", "TRU")]
annot_col2 = annot_col[colnames(df2), ]
pheatmap(df2, annotation_col = annot_col2, annotation_row = annot_row2, annotation_colors = annot_colors2, cluster_cols = FALSE, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, trace = "none", color = brewer.pal(10, "RdBu"))  
plot of chunk trf-heatmap-subtype

Sort samples by stage

df2 = df[, metadata[order(metadata$stage), ]$barcode]
df2 = df2[, metadata[order(metadata$stage), ]$stage %in% c("I", "II", "III", "IV")]
annot_col2 = annot_col[colnames(df2), ]
pheatmap(df2, annotation_col = annot_col2, annotation_row = annot_row2, annotation_colors = annot_colors2, cluster_cols = FALSE, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, trace = "none", color = brewer.pal(10, "RdBu"))
plot of chunk trf-heatmap-stage

## Saving search path..
## Saving list of loaded packages..
## Saving all data...
## Done.